Data Loading

Load required libraries.

library(arm)
library(ggplot2)
library(grid)
library(dplyr)
library(tidyr)
library(RMySQL)
library(RCurl)
source('data_loading.R')

Load in Wordbank common data.

wordbank <- src_mysql(host = "54.149.39.46", dbname="wordbank",
                      user = "wordbank", password = "wordbank")

common.tables <- get.common.tables(wordbank)

admins <- get.administration.data(common.tables$momed,
                                  common.tables$child,
                                  common.tables$instrumentsmap,
                                  common.tables$administration) %>%
  filter(form == "WS", age > 15, age < 32) %>%
  mutate(age.group = cut(age, breaks = c(15, 19, 23, 27, 31)))

items <- get.item.data(common.tables$wordmapping,
                       common.tables$instrumentsmap,
                       common.tables$category)

instrument.tables <- get.instrument.tables(wordbank, common.tables$instruments)

# TEMPORARY HACK
instrument.tables$has_grammar <- c(0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0)

grammar.languages <- unique(filter(instrument.tables, has_grammar == 1)$language)
languages <- unique(instrument.tables$language)

Show number of items in each relevant section.

sections <- items %>%
  filter(form == "WS") %>%
  group_by(language, type) %>%
  summarise(n = n()) %>%
  spread(type, n) %>%
  select(language, word, word_form, complexity)
sections[is.na(sections)] = 0
kable(sections)
language word word_form complexity
Croatian 717 0 0
Danish 725 29 33
English 680 25 37
German 588 0 0
Italian 670 0 0
Norwegian 731 33 42
Russian 728 0 0
Spanish 680 24 37
Swedish 710 0 0
Turkish 711 0 0

Show total number of administrations in each language.

n.admin <- admins %>%
  group_by(language) %>%
  summarise(n = n())
kable(n.admin)
language n
Croatian 377
Danish 3038
English 5595
German 1183
Italian 658
Norwegian 10095
Russian 773
Spanish 1094
Swedish 900
Turkish 1861

Show number of administrations in each language by age group.

n.admin.age <- admins %>% 
  group_by(language, age.group) %>% 
  summarise(n = n()) %>%
  spread(age.group, n)
kable(n.admin.age)
language (15,19] (19,23] (23,27] (27,31]
Croatian 81 105 114 77
Danish 725 857 736 720
English 2020 947 1241 1387
German 164 364 380 275
Italian 70 227 194 167
Norwegian 1236 2892 3176 2791
Russian 83 195 239 256
Spanish 260 322 290 222
Swedish 311 307 143 139
Turkish 465 500 460 436

Some utility functions for transforming data values.

get.coded.type <- function(type, complexity_category) {
  if (type == "complexity") {
    return(complexity_category)
    } else {
      return(type)
      }
  }

get.value <- function(type, value) {
  if (type == "word_form" | type == "word") {
    return(value == "produces")
    } else if (type == "complexity") {
      return(value == "complex")
      }
  }

Analysis 1: Syntax and Morphology

Get kid by item data for wordform and complexity items all languages and aggregate them.

get.grammar.data <- function(lang) {
  
  lang.grammar.items <- items %>%
    filter(language == lang, form == "WS",
           type == "word_form" | type == "complexity") %>%
    rename(column = item.id) %>%
    mutate(item.id = as.numeric(substr(column, 6, nchar(column)))) %>%
    select(column, item.id, type, item, definition, complexity_category)
  
  lang.instrument.table <- filter(instrument.tables, language == lang,
                                  form == "WS")$table[[1]]
  
  lang.grammar.data <- get.instrument.data(lang.instrument.table,
                                           lang.grammar.items$column) %>%
    left_join(lang.grammar.items) %>%
    group_by(data_id, type) %>%
    mutate(no_section = all(is.na(value))) %>%
    filter(!no_section) %>%
    mutate(value = ifelse(is.na(value), "", value),
           value = get.value(unique(type), value),
           coded.type = get.coded.type(unique(type), complexity_category),
           coded.type = factor(coded.type,
                               levels = c("word_form", "morphology", "syntax"),
                               labels = c("Word Form",
                                          "Complexity (Morphological)",
                                          "Complexity (Syntactic)")),
           measure = factor(type, levels = c("word_form", "complexity"),
                            labels = c("Word Form", "Complexity"))) %>%
    ungroup() %>%
    select(-complexity_category, -no_section, -type, -column)
  
  num.words <- nrow(filter(items, language == lang, form == "WS", type == "word"))
  
  lang.admins <- admins %>%
    filter(language == lang) %>%
    select(data_id, age, age.group, production, language) %>%
    mutate(vocab.mean = production / num.words)
  
  lang.data <- left_join(lang.grammar.data, lang.admins) %>%
    filter(age > 15 & age < 32)
  
  return(lang.data)
  
  }

grammar.data <- bind_rows(sapply(grammar.languages, get.grammar.data, simplify = FALSE))

Get by kid summary data for all languages.

grammar.summary <- grammar.data %>%
  group_by(language, measure, data_id, age, age.group, vocab.mean) %>%
  summarise(sum = sum(value),
            diff = length(value) - sum,
            mean = sum / length(value))

Fit grammar score models and use them to predict data.

grammar.models <- grammar.summary %>%
  group_by(language, measure) %>%
  do(model = glm(cbind(sum, diff) ~ vocab.mean + age.group,
                 data = ., family="binomial"))

get.grammar.model <- function(lang, meas) {
  return(filter(grammar.models, language == lang, measure == meas)$model[[1]])
  }

grammar.predicted.data <- grammar.summary %>%
  group_by(language, measure) %>%
  mutate(predicted = invlogit(predict.lm(get.grammar.model(unique(language),
                                                           unique(measure)),
                                         data = ., family="binomial")))

Plot score as a function of vocabulary size for each language and measure with model prediction curves.

ggplot(grammar.predicted.data, aes(x = vocab.mean, y = mean, 
                                   colour = age.group, fill = age.group,
                                   label = age.group)) + 
  geom_jitter(alpha=.3, size=.75) +
  geom_line(aes(y=predicted),size=0.65) + 
  facet_grid(language~measure) + 
  scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.2),
                     name = "\nVocabulary Size") + 
  scale_y_continuous(limits = c(0,1), breaks = seq(0,1,.25),
                     "Score (Mean Items)\n") + 
  theme_bw(base_size = 11) +
  theme(legend.position = c(0.06,0.92),
        legend.text = element_text(size=7),
        legend.title = element_text(size=7),
        legend.key.height = unit(0.7, "char"),
        legend.key.width = unit(0.4, "cm"),
        legend.key = element_blank(),
        legend.background = element_rect(fill="transparent"),
        text=element_text(family=font)) +
  scale_color_brewer(type="div", palette=9,
                     name="Age Group\n (months)") +
  scale_fill_brewer(palette = "Spectral",
                    guide=FALSE)

Model comparison: fit grammar models and get their AICs and age coefficients.

grammar.model.metrics <- grammar.summary %>%
  group_by(language, measure) %>%
  do(model.vocab = glm(cbind(sum,diff) ~ vocab.mean, data = .,
                       family="binomial"),
     model.vocab.age = glm(cbind(sum,diff) ~ vocab.mean + age, data = .,
                           family="binomial"),
     model.vocab.age = glm(cbind(sum,diff) ~ vocab.mean * age, data = .,
                           family="binomial")) %>%
  mutate(AIC.vocab = AIC(model.vocab),
         AIC.vocab.age = AIC(model.vocab.age),
         deltaAIC = AIC.vocab - AIC.vocab.age,
         age.coef = coef(model.vocab.age)["age"],
         age.se = se.coef(model.vocab.age)["age"])

Show AICs of grammar models.

kable(select(grammar.model.metrics,
             language, measure, AIC.vocab, AIC.vocab.age, deltaAIC))
language measure AIC.vocab AIC.vocab.age deltaAIC
Danish Word Form 11529.402 11508.696 20.70574
Danish Complexity 13774.059 13324.288 449.77111
English Word Form 18762.575 18664.378 98.19717
English Complexity 30320.285 28695.682 1624.60326
Norwegian Word Form 52851.103 52771.409 79.69365
Norwegian Complexity 77467.584 74612.829 2854.75512
Spanish Word Form 6285.671 6172.703 112.96810
Spanish Complexity 10596.578 9700.317 896.26133

Plot age effect coefficients for each language and measure.

ggplot(grammar.model.metrics, 
       aes(x=language, y=age.coef, fill=measure)) + 
  geom_bar(position="dodge", stat="identity") + 
  geom_linerange(aes(ymin=age.coef-1.96*age.se, ymax=age.coef+1.96*age.se), 
                 position = position_dodge(width=.9)) +
  ylab("Age effect coefficient") + 
  xlab("") +
  theme_bw(base_size = 14) +
  scale_fill_brewer(palette="Set2",
                    name="Measure") +
  theme(legend.position = c(0.15,0.86),
        legend.text = element_text(size=14),
        legend.title = element_text(size=15),
        legend.key.height = unit(1.5, "char"),
        legend.key = element_blank(),
        legend.background = element_rect(fill="transparent"),
        text=element_text(family=font))

Fit models for each wordform and complexity item and get their age coefficients.

item.models <- grammar.data %>%
  group_by(language, item, coded.type) %>%
  do(model = glm(value ~ vocab.mean + age, data = ., family = "binomial")) %>%
  mutate(coef = coef(model)["age"],
         se = se.coef(model)["age"])

Function for plotting age effect coefficients by item for a language.

plot.item.coefs <- function(item.models, lang) {
  
  lang.item.models <- filter(item.models, language == lang) %>%
    arrange(coef) %>%
    mutate(item = factor(item, levels=item))
  
  item.plot <- ggplot(lang.item.models,
                      aes(x=item, y=coef, fill=coded.type, label=item)) +
    geom_bar(stat="identity", position="identity", alpha=.5, width=0.9) +
    geom_linerange(aes(ymin=coef-1.96*se, ymax=coef+1.96*se),
                   position = position_dodge(width=.9)) +
    theme_bw(base_size = 12) +
    scale_y_continuous(name="Age Effect Coefficient") +
    scale_x_discrete(name="",breaks=NULL) +
    annotate("text", x = length(lang.item.models$item)/2,
             y = min(lang.item.models$coef-1.96*lang.item.models$se), vjust=0,
             label = lang, size = 9, family=font) +
    scale_fill_brewer(palette="Set2", name="Item Type", drop=FALSE) +
    theme(legend.position = c(0.22,0.82),
          legend.text = element_text(size=14),
          legend.title = element_text(size=13),
          legend.key = element_blank(),
          legend.key.height = unit(1.5, "char"),
          text = element_text(family=font),
          axis.title.y = element_text(size=16),
          axis.text.y = element_text(size=13))
  
  return(item.plot)
  
  }

Plot item interactions for Norwegian.

plot.item.coefs(item.models, "Norwegian")

Plot item interactions for English.

plot.item.coefs(item.models, "English")

Plot item interactions for Danish.

plot.item.coefs(item.models, "Danish")

Plot item interactions for Spanish.

plot.item.coefs(item.models, "Spanish")


Analysis 2: Vocabulary Composition

Get vocabulary composition data for all languages.

get.vocab.composition <- function(lang) {
  
  lang.vocab.items <- filter(items, language == lang, form == "WS", type == "word") %>%
    rename(column = item.id) %>%
    mutate(item.id = as.numeric(substr(column, 6, nchar(column))))
  
  lang.instrument.table <- filter(instrument.tables, language == lang,
                                  form == "WS")$table[[1]]
  
  lang.vocab.data <- get.instrument.data(lang.instrument.table,
                                         lang.vocab.items$column) %>%
    left_join(select(lang.vocab.items, item.id, lexical_category, item, definition)) %>%
    mutate(value = ifelse(is.na(value), "", value),
           value = get.value("word", value))
  
  num.words <- nrow(lang.vocab.items)
  
  lang.admins <- admins %>%
    filter(language == lang) %>%
    select(data_id, age, age.group, production, language) %>%
    mutate(vocab.mean = production / num.words)
  
  lang.vocab.summary <- left_join(lang.vocab.data, lang.admins) %>%
    filter(age > 15 & age < 32) %>%
    group_by(data_id, lexical_category, age, age.group, vocab.mean, language) %>%
    summarise(sum = sum(value),
              diff = length(value) - sum,
              mean = sum / length(value))
  
  return(lang.vocab.summary)
  
  }

vocab.composition <- bind_rows(sapply(languages, get.vocab.composition,
                                      simplify = FALSE)) %>%
  filter(lexical_category != "other", lexical_category != "unknown") %>%
  mutate(lexical_category = factor(lexical_category,
                                   levels=c("nouns", "predicates", "function_words"),
                                   labels=c("Nouns", "Predicates", "Function Words")))

Fit vocabulary composition models and use them to predict data.

vocab.models <- vocab.composition %>%
  group_by(language, lexical_category) %>%
  do(model = glm(cbind(sum, diff) ~ vocab.mean,
                 data = ., family = "binomial"))

get.vocab.model <- function(lang, cat) {
  return(filter(vocab.models, language == lang, lexical_category == cat)$model[[1]])
  }

vocab.predicted.data <- vocab.composition %>%
  group_by(language, lexical_category) %>%
  mutate(predicted = invlogit(predict.lm(get.vocab.model(unique(language),
                                                         unique(lexical_category)),
                                         data = ., family = "binomial")))

Plot vocabulary composition as a function of vocabulary size for each language with model prediction curves.

ggplot(vocab.predicted.data,
       aes(x=vocab.mean, y=mean, colour=lexical_category, label=lexical_category)) +
  geom_jitter(alpha=0.15, size=.75) +
  geom_line(aes(y=predicted),size=0.65) + 
  facet_wrap(~ language) +
  scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
                     name = "Proportion of Category\n") +
  scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.2),
                     name = "\nVocabulary Size") +
  theme_bw(base_size=12) + 
  theme(legend.position = c(0.065,0.95),
        legend.text = element_text(size=9),
        legend.title = element_text(size=9, lineheight=unit(0.8, "char")),
        legend.key.height = unit(0.8, "char"),
        legend.key.width = unit(0.3, "cm"),
        legend.key = element_blank(),
        legend.background = element_rect(fill="transparent"),
        text = element_text(family=font)) +
  scale_color_brewer(palette = "Set2", name = "Lexical Category")

Fit vocabulary composition models and get their AICs and age coefficients.

vocab.model.metrics <- vocab.composition %>%
  group_by(language, lexical_category) %>%
  do(model.vocab = glm(cbind(sum, diff) ~ vocab.mean, data = .,
                       family="binomial"),
     model.vocab.age = glm(cbind(sum, diff) ~ vocab.mean + age, data = .,
                           family="binomial")) %>%
  mutate(AIC.vocab = AIC(model.vocab),
         AIC.vocab.age = AIC(model.vocab.age),
         deltaAIC = AIC.vocab - AIC.vocab.age,
         age.coef = coef(model.vocab.age)["age"],
         age.se = se.coef(model.vocab.age)["age"])

Show AICs of vocabulary composition models.

kable(select(vocab.model.metrics,
             language, lexical_category, AIC.vocab, AIC.vocab.age, deltaAIC))
language lexical_category AIC.vocab AIC.vocab.age deltaAIC
Croatian Nouns 6408.926 6291.078 117.8476177
Croatian Predicates 4179.132 4181.121 -1.9889332
Croatian Function Words 4341.321 4330.275 11.0456943
Danish Nouns 52872.674 52609.708 262.9656407
Danish Predicates 27314.629 27223.805 90.8244673
Danish Function Words 27541.197 27393.853 147.3439923
English Nouns 86115.764 86107.445 8.3184184
English Predicates 51090.656 50878.709 211.9466748
English Function Words 53702.533 53404.564 297.9689868
German Nouns 16395.952 16387.158 8.7931039
German Predicates 9384.208 9379.200 5.0088950
German Function Words 11516.074 11466.015 50.0597137
Italian Nouns 9899.290 9892.355 6.9350636
Italian Predicates 6940.561 6940.308 0.2539017
Italian Function Words 7052.579 6845.269 207.3103058
Norwegian Nouns 148432.089 148423.959 8.1295136
Norwegian Predicates 90151.776 89814.037 337.7392848
Norwegian Function Words 104555.150 104021.522 533.6285278
Russian Nouns 13121.501 13113.062 8.4394271
Russian Predicates 8146.314 8110.742 35.5720150
Russian Function Words 8644.789 8226.644 418.1454301
Spanish Nouns 19414.464 19368.882 45.5813467
Spanish Predicates 13596.813 13594.366 2.4472487
Spanish Function Words 13253.564 13121.481 132.0835721
Swedish Nouns 15355.704 15116.197 239.5076328
Swedish Predicates 8272.710 8128.340 144.3698614
Swedish Function Words 7046.174 6840.327 205.8470912
Turkish Nouns 27188.128 27165.389 22.7390742
Turkish Predicates 27484.975 27387.501 97.4744903
Turkish Function Words 20898.234 20889.244 8.9896409

Plot age effect coefficients for each language and lexical category.

ggplot(vocab.model.metrics, 
       aes(x=language, y=age.coef, fill=lexical_category)) + 
  geom_bar(position="dodge", stat="identity") + 
  geom_linerange(aes(ymin=age.coef-1.96*age.se, ymax=age.coef+1.96*age.se), 
                 position = position_dodge(width=.9)) +
  ylab("Age effect coefficient") + 
  xlab("") +
  theme_bw(base_size = 14) +
  scale_fill_brewer(palette = "Set2",
                    name = "Lexical Category") +
  theme(legend.position = c(0.115,0.82),
        legend.text = element_text(size=13),
        legend.title = element_text(size=13),
        legend.key.height = unit(1.5, "char"),
        legend.key = element_blank(),
        legend.background = element_rect(fill="transparent"),
        text = element_text(family=font))